import redis
from scrapy.exceptions import DropItem
from model import *
# Data cleaning: trim string fields and fill default counters
class DataCleanPipeline(object):
    """Normalizes scraped items before downstream pipelines.

    Strips leading/trailing whitespace from every string field and
    guarantees that the numeric counters ``read_num`` / ``reply_num``
    exist (defaulting to 0) so later stages can rely on them.
    """

    def process_item(self, item, spider):
        """Clean *item* in place and return it.

        :param item: dict-like scraped item
        :param spider: the spider that produced the item (unused)
        :returns: the cleaned item
        """
        for key in item:
            # isinstance is the idiomatic type check (also covers str subclasses),
            # unlike the exact-type comparison type(x) == str.
            if isinstance(item[key], str):
                item[key] = item[key].strip()
        item['read_num'] = item.get('read_num', 0)    # view count
        item['reply_num'] = item.get('reply_num', 0)  # comment count
        return item

# Data filtering: drop empty records and Redis-tracked duplicates
class DataDropPipeline(object):
    """Filters out incomplete items and duplicates.

    A Redis set named ``blog`` keeps every ``href`` already seen, so a
    re-crawled article is dropped instead of being stored twice.
    """

    def __init__(self):
        # Connection pool to the local Redis instance used as the dedup store.
        # NOTE(review): host/port are hard-coded; consider moving to settings.
        rdp = redis.ConnectionPool(host='127.0.0.1', port=6379)
        self.rdc = redis.StrictRedis(connection_pool=rdp)

    def process_item(self, item, spider):
        """Return *item* if it is complete and unseen, else raise DropItem.

        :raises DropItem: when title/contents are empty, or the href was
            already recorded in the Redis ``blog`` set.
        """
        # Truthiness also catches None, not just the empty string.
        if not item['title'] or not item['contents']:
            # Fix: the original raised 'Duplicate item found' here too,
            # which misreported why the item was dropped.
            raise DropItem('Empty title/contents %s' % item)
        if self.rdc.sismember('blog', item['href']):
            raise DropItem('Duplicate item found %s' % item)
        # First time we see this href: remember it, then pass the item on.
        self.rdc.sadd('blog', item['href'])
        return item

# Data storage: persist items into MySQL through the model layer
class DataMysqlSavePipeline(object):
    """Persists cleaned, deduplicated blog items into MySQL.

    Resolves (or creates) the Author and Category rows for each item,
    then inserts one Blog row linking to them.
    """

    def open_spider(self, spider):
        # Initialize the schema on spider start (creates tables if absent).
        DB_Util.init_db()

    def _get_or_create_id(self, session, model_cls, name):
        """Return the Id of the row of *model_cls* whose name is *name*,
        creating and committing a new row when none exists yet.

        NOTE(review): assumes a Flask-SQLAlchemy-style ``query`` attribute
        and an ``Id`` primary-key column on the model — matches usage here.
        """
        # Fix: the original compared the Query object itself, which is
        # always truthy, so the create branch was unreachable and
        # ``.Id`` was read off a Query instead of a row. ``.first()``
        # actually fetches the row (or None).
        obj = model_cls.query.filter_by(name=name).first()
        if obj is None:
            obj = model_cls()
            obj.name = name
            session.add(obj)
            session.commit()  # commit so the autogenerated Id is populated
        return obj.Id

    def process_item(self, item, spider):
        """Store *item* as a Blog row and return it unchanged."""
        session = DB_Util.get_session()
        author_id = self._get_or_create_id(session, Author, item['author'])
        # Fix: the original assigned item['author'] to the new Category's
        # name (copy-paste bug); the category name comes from 'category'.
        category_id = self._get_or_create_id(session, Category, item['category'])

        blog = Blog()
        blog.title = item['title']
        blog.contents = item['contents']
        blog.author = author_id
        blog.category = category_id
        blog.labels = item['labels']        # tags
        blog.fbrq = item['fbrq']            # publish date
        blog.read_num = item['read_num']    # view count
        blog.reply_num = item['reply_num']  # comment count
        blog.href = item['href']            # article link
        blog.source = item['source']        # origin site

        session.add(blog)
        session.commit()
        return item